Carbon Dioxide Emissions from the World Bank’s World Development Index

library(tidyverse)
library(DT)
library(plotly)
library(skimr)

Preparing the data

First we need to import the data and clean up any issues

Read in the file

df <- read_csv(here::here("data", "week11", "co2_global_emissions.csv"))

Take a look at the data

I use the str function to inspect the structure of the dataframe. There are two columns for the inidicator which in this case are all the same and can be removed. It also makes sense to convert the table to a long format instead of having each year represented by its own column.

str(df)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 264 obs. of  63 variables:
##  $ Country Name  : chr  "Aruba" "Afghanistan" "Angola" "Albania" ...
##  $ Country Code  : chr  "ABW" "AFG" "AGO" "ALB" ...
##  $ Indicator Name: chr  "CO2 emissions (metric tons per capita)" "CO2 emissions (metric tons per capita)" "CO2 emissions (metric tons per capita)" "CO2 emissions (metric tons per capita)" ...
##  $ Indicator Code: chr  "EN.ATM.CO2E.PC" "EN.ATM.CO2E.PC" "EN.ATM.CO2E.PC" "EN.ATM.CO2E.PC" ...
##  $ 1960          : num  NA 0.0461 0.0975 1.2582 NA ...
##  $ 1961          : num  NA 0.0536 0.079 1.3742 NA ...
##  $ 1962          : num  NA 0.0738 0.2013 1.44 NA ...
##  $ 1963          : num  NA 0.0742 0.1925 1.1817 NA ...
##  $ 1964          : num  NA 0.0863 0.201 1.1117 NA ...
##  $ 1965          : num  NA 0.101 0.192 1.166 NA ...
##  $ 1966          : num  NA 0.108 0.246 1.333 NA ...
##  $ 1967          : num  NA 0.124 0.155 1.364 NA ...
##  $ 1968          : num  NA 0.115 0.256 1.52 NA ...
##  $ 1969          : num  NA 0.0868 0.4196 1.559 NA ...
##  $ 1970          : num  NA 0.15 0.529 1.753 NA ...
##  $ 1971          : num  NA 0.166 0.492 1.989 NA ...
##  $ 1972          : num  NA 0.131 0.635 2.516 NA ...
##  $ 1973          : num  NA 0.136 0.671 2.304 NA ...
##  $ 1974          : num  NA 0.156 0.652 1.849 NA ...
##  $ 1975          : num  NA 0.169 0.575 1.911 NA ...
##  $ 1976          : num  NA 0.155 0.416 2.014 NA ...
##  $ 1977          : num  NA 0.183 0.435 2.276 NA ...
##  $ 1978          : num  NA 0.163 0.646 2.531 NA ...
##  $ 1979          : num  NA 0.168 0.637 2.898 NA ...
##  $ 1980          : num  NA 0.133 0.599 1.935 NA ...
##  $ 1981          : num  NA 0.152 0.571 2.693 NA ...
##  $ 1982          : num  NA 0.165 0.485 2.625 NA ...
##  $ 1983          : num  NA 0.204 0.515 2.683 NA ...
##  $ 1984          : num  NA 0.235 0.487 2.694 NA ...
##  $ 1985          : num  NA 0.298 0.443 2.658 NA ...
##  $ 1986          : num  2.868 0.271 0.427 2.665 NA ...
##  $ 1987          : num  7.235 0.272 0.518 2.414 NA ...
##  $ 1988          : num  10.026 0.248 0.446 2.332 NA ...
##  $ 1989          : num  10.635 0.236 0.424 2.783 NA ...
##  $ 1990          : num  26.375 0.213 0.42 1.678 7.467 ...
##  $ 1991          : num  26.046 0.188 0.405 1.312 7.182 ...
##  $ 1992          : num  21.4426 0.0997 0.4007 0.7747 6.9121 ...
##  $ 1993          : num  22.0008 0.0892 0.4309 0.7238 6.7361 ...
##  $ 1994          : num  21.036 0.08 0.281 0.6 6.494 ...
##  $ 1995          : num  20.7719 0.0727 0.7692 0.6545 6.6621 ...
##  $ 1996          : num  20.318 0.066 0.712 0.637 7.065 ...
##  $ 1997          : num  20.4268 0.0596 0.4892 0.4904 7.2397 ...
##  $ 1998          : num  20.5877 0.0552 0.4714 0.5603 7.6608 ...
##  $ 1999          : num  20.3116 0.0423 0.5741 0.9602 7.9755 ...
##  $ 2000          : num  26.1949 0.0385 0.5804 0.9782 8.0193 ...
##  $ 2001          : num  25.934 0.039 0.573 1.053 7.787 ...
##  $ 2002          : num  25.6712 0.0487 0.7208 1.2295 7.5906 ...
##  $ 2003          : num  26.4205 0.0518 0.498 1.4127 7.3158 ...
##  $ 2004          : num  26.5173 0.0394 0.9962 1.3762 7.3586 ...
##  $ 2005          : num  27.2007 0.0529 0.9797 1.4125 7.2999 ...
##  $ 2006          : num  26.9483 0.0637 1.0989 1.3026 6.7462 ...
##  $ 2007          : num  27.8956 0.0854 1.1978 1.3223 6.5195 ...
##  $ 2008          : num  26.231 0.154 1.182 1.484 6.428 ...
##  $ 2009          : num  25.916 0.242 1.232 1.496 6.122 ...
##  $ 2010          : num  24.671 0.294 1.243 1.579 6.123 ...
##  $ 2011          : num  24.506 0.412 1.253 1.804 5.867 ...
##  $ 2012          : num  13.16 0.35 1.33 1.69 5.92 ...
##  $ 2013          : num  8.351 0.316 1.255 1.749 5.901 ...
##  $ 2014          : num  8.408 0.299 1.291 1.979 5.832 ...
##  $ 2015          : logi  NA NA NA NA NA NA ...
##  $ 2016          : logi  NA NA NA NA NA NA ...
##  $ 2017          : logi  NA NA NA NA NA NA ...
##  $ 2018          : logi  NA NA NA NA NA NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Country Name` = col_character(),
##   ..   `Country Code` = col_character(),
##   ..   `Indicator Name` = col_character(),
##   ..   `Indicator Code` = col_character(),
##   ..   `1960` = col_double(),
##   ..   `1961` = col_double(),
##   ..   `1962` = col_double(),
##   ..   `1963` = col_double(),
##   ..   `1964` = col_double(),
##   ..   `1965` = col_double(),
##   ..   `1966` = col_double(),
##   ..   `1967` = col_double(),
##   ..   `1968` = col_double(),
##   ..   `1969` = col_double(),
##   ..   `1970` = col_double(),
##   ..   `1971` = col_double(),
##   ..   `1972` = col_double(),
##   ..   `1973` = col_double(),
##   ..   `1974` = col_double(),
##   ..   `1975` = col_double(),
##   ..   `1976` = col_double(),
##   ..   `1977` = col_double(),
##   ..   `1978` = col_double(),
##   ..   `1979` = col_double(),
##   ..   `1980` = col_double(),
##   ..   `1981` = col_double(),
##   ..   `1982` = col_double(),
##   ..   `1983` = col_double(),
##   ..   `1984` = col_double(),
##   ..   `1985` = col_double(),
##   ..   `1986` = col_double(),
##   ..   `1987` = col_double(),
##   ..   `1988` = col_double(),
##   ..   `1989` = col_double(),
##   ..   `1990` = col_double(),
##   ..   `1991` = col_double(),
##   ..   `1992` = col_double(),
##   ..   `1993` = col_double(),
##   ..   `1994` = col_double(),
##   ..   `1995` = col_double(),
##   ..   `1996` = col_double(),
##   ..   `1997` = col_double(),
##   ..   `1998` = col_double(),
##   ..   `1999` = col_double(),
##   ..   `2000` = col_double(),
##   ..   `2001` = col_double(),
##   ..   `2002` = col_double(),
##   ..   `2003` = col_double(),
##   ..   `2004` = col_double(),
##   ..   `2005` = col_double(),
##   ..   `2006` = col_double(),
##   ..   `2007` = col_double(),
##   ..   `2008` = col_double(),
##   ..   `2009` = col_double(),
##   ..   `2010` = col_double(),
##   ..   `2011` = col_double(),
##   ..   `2012` = col_double(),
##   ..   `2013` = col_double(),
##   ..   `2014` = col_double(),
##   ..   `2015` = col_logical(),
##   ..   `2016` = col_logical(),
##   ..   `2017` = col_logical(),
##   ..   `2018` = col_logical()
##   .. )

Fix the column names

Here I convert the column names to camel case

names(df) <- names(df) %>% tolower() %>% str_replace(" ", "_")

Let’s take another look

The skimr package contains tools for summarising various data structures in R.

df %>% skim_to_wide()
type variable missing complete n min max empty n_unique mean count sd p0 p25 p50 p75 p100 hist
character country_code 0 264 264 3 3 0 264 NA NA NA NA NA NA NA NA NA
character country_name 0 264 264 4 52 0 264 NA NA NA NA NA NA NA NA NA
character indicator_code 0 264 264 14 14 0 1 NA NA NA NA NA NA NA NA NA
character indicator_name 0 264 264 38 38 0 1 NA NA NA NA NA NA NA NA NA
logical 2015 264 0 264 NA NA NA NA NaN 264 NA NA NA NA NA NA NA
logical 2016 264 0 264 NA NA NA NA NaN 264 NA NA NA NA NA NA NA
logical 2017 264 0 264 NA NA NA NA NaN 264 NA NA NA NA NA NA NA
logical 2018 264 0 264 NA NA NA NA NaN 264 NA NA NA NA NA NA NA
numeric 1960 72 192 264 NA NA NA NA 2.04 NA 4.19 0.008 0.18 0.62 1.7 36.69 ▇▁▁▁▁▁▁▁
numeric 1961 71 193 264 NA NA NA NA 2.16 NA 4.38 0.0079 0.18 0.65 1.75 36.58 ▇▁▁▁▁▁▁▁
numeric 1962 69 195 264 NA NA NA NA 2.25 NA 4.72 0.0085 0.2 0.65 1.94 42.24 ▇▁▁▁▁▁▁▁
numeric 1963 68 196 264 NA NA NA NA 2.76 NA 8.46 0.0094 0.2 0.65 1.72 99.46 ▇▁▁▁▁▁▁▁
numeric 1964 61 203 264 NA NA NA NA 2.91 NA 8.49 0.012 0.22 0.77 2.02 92.86 ▇▁▁▁▁▁▁▁
numeric 1965 61 203 264 NA NA NA NA 3.03 NA 8.7 0.012 0.23 0.7 2.19 85.46 ▇▁▁▁▁▁▁▁
numeric 1966 61 203 264 NA NA NA NA 3.04 NA 8.05 0.013 0.25 0.75 2.46 78.63 ▇▁▁▁▁▁▁▁
numeric 1967 61 203 264 NA NA NA NA 3.11 NA 7.26 0.012 0.25 0.8 2.92 77.51 ▇▁▁▁▁▁▁▁
numeric 1968 61 203 264 NA NA NA NA 3.31 NA 7.18 -0.02 0.27 0.99 3.26 75.98 ▇▁▁▁▁▁▁▁
numeric 1969 61 203 264 NA NA NA NA 3.92 NA 9.9 0.016 0.32 1.06 3.6 100.7 ▇▁▁▁▁▁▁▁
numeric 1970 59 205 264 NA NA NA NA 4.2 NA 9.25 0.012 0.35 1 4.01 69.11 ▇▁▁▁▁▁▁▁
numeric 1971 58 206 264 NA NA NA NA 4.42 NA 9.87 0.012 0.34 1.1 4.5 76.64 ▇▁▁▁▁▁▁▁
numeric 1972 56 208 264 NA NA NA NA 4.49 NA 10.13 0.012 0.35 1.11 4.52 82.62 ▇▁▁▁▁▁▁▁
numeric 1973 56 208 264 NA NA NA NA 4.81 NA 10.75 0.011 0.37 1.14 5.12 87.65 ▇▁▁▁▁▁▁▁
numeric 1974 56 208 264 NA NA NA NA 4.5 NA 9.18 0.0097 0.37 1.23 4.64 68.23 ▇▁▁▁▁▁▁▁
numeric 1975 56 208 264 NA NA NA NA 4.37 NA 8.45 0.0097 0.38 1.29 4.85 66.64 ▇▁▁▁▁▁▁▁
numeric 1976 56 208 264 NA NA NA NA 4.36 NA 8.05 0.0099 0.36 1.36 5.17 61.29 ▇▁▁▁▁▁▁▁
numeric 1977 56 208 264 NA NA NA NA 4.49 NA 7.99 0.01 0.39 1.44 5.29 54.41 ▇▂▁▁▁▁▁▁
numeric 1978 56 208 264 NA NA NA NA 4.51 NA 8.03 0.0074 0.4 1.52 5.75 54.83 ▇▂▁▁▁▁▁▁
numeric 1979 56 208 264 NA NA NA NA 4.56 NA 7.95 0.0043 0.44 1.58 5.49 69.94 ▇▁▁▁▁▁▁▁
numeric 1980 56 208 264 NA NA NA NA 4.46 NA 7.41 0.036 0.45 1.53 5.49 58.53 ▇▂▁▁▁▁▁▁
numeric 1981 56 208 264 NA NA NA NA 3.99 NA 6.19 0.03 0.47 1.52 5.31 51.83 ▇▂▁▁▁▁▁▁
numeric 1982 56 208 264 NA NA NA NA 3.87 NA 5.78 0.028 0.45 1.48 5.38 44.54 ▇▂▁▁▁▁▁▁
numeric 1983 56 208 264 NA NA NA NA 3.73 NA 5.29 0.031 0.45 1.37 5.41 36.41 ▇▂▁▁▁▁▁▁
numeric 1984 56 208 264 NA NA NA NA 3.82 NA 5.56 0.041 0.48 1.45 5.26 36.12 ▇▂▁▁▁▁▁▁
numeric 1985 56 208 264 NA NA NA NA 3.92 NA 5.6 0.035 0.47 1.54 5.56 35.89 ▇▁▁▁▁▁▁▁
numeric 1986 55 209 264 NA NA NA NA 3.91 NA 5.53 0.036 0.44 1.55 4.98 33.41 ▇▂▁▁▁▁▁▁
numeric 1987 55 209 264 NA NA NA NA 3.94 NA 5.33 0.037 0.48 1.64 5.39 30.56 ▇▂▁▁▁▁▁▁
numeric 1988 55 209 264 NA NA NA NA 4.08 NA 5.48 0.012 0.51 1.76 5.8 29.21 ▇▂▁▁▁▁▁▁
numeric 1989 55 209 264 NA NA NA NA 4.21 NA 5.73 0.018 0.5 1.64 5.84 31.03 ▇▂▁▁▁▁▁▁
numeric 1990 49 215 264 NA NA NA NA 4.08 NA 5.63 0.024 0.46 1.67 5.91 27.96 ▇▂▂▁▁▁▁▁
numeric 1991 47 217 264 NA NA NA NA 4.12 NA 5.68 0.011 0.45 1.86 5.99 36.32 ▇▂▁▁▁▁▁▁
numeric 1992 23 241 264 NA NA NA NA 4.48 NA 6.02 0.013 0.57 2.28 6.47 54.09 ▇▂▁▁▁▁▁▁
numeric 1993 23 241 264 NA NA NA NA 4.5 NA 6.34 0.014 0.53 2.24 6.66 61.25 ▇▂▁▁▁▁▁▁
numeric 1994 22 242 264 NA NA NA NA 4.42 NA 6.24 0.015 0.57 2.19 6.45 59.6 ▇▂▁▁▁▁▁▁
numeric 1995 21 243 264 NA NA NA NA 4.47 NA 6.38 0.016 0.58 2.32 6.48 61.91 ▇▂▁▁▁▁▁▁
numeric 1996 21 243 264 NA NA NA NA 4.49 NA 6.25 0.017 0.62 2.4 6.76 61.84 ▇▂▁▁▁▁▁▁
numeric 1997 20 244 264 NA NA NA NA 4.49 NA 6.56 0.019 0.68 2.27 6.58 70.14 ▇▁▁▁▁▁▁▁
numeric 1998 19 245 264 NA NA NA NA 4.48 NA 6.18 0.019 0.7 2.25 6.55 58.87 ▇▂▁▁▁▁▁▁
numeric 1999 19 245 264 NA NA NA NA 4.45 NA 5.96 0.02 0.74 2.26 6.69 55.16 ▇▂▁▁▁▁▁▁
numeric 2000 19 245 264 NA NA NA NA 4.58 NA 6.37 0.017 0.74 2.34 6.61 58.64 ▇▂▁▁▁▁▁▁
numeric 2001 19 245 264 NA NA NA NA 4.63 NA 6.52 0.017 0.76 2.44 6.92 67.11 ▇▁▁▁▁▁▁▁
numeric 2002 18 246 264 NA NA NA NA 4.6 NA 6.31 0.019 0.76 2.5 6.95 63.35 ▇▂▁▁▁▁▁▁
numeric 2003 18 246 264 NA NA NA NA 4.73 NA 6.36 0.019 0.8 2.63 7.24 60.3 ▇▂▁▁▁▁▁▁
numeric 2004 18 246 264 NA NA NA NA 4.78 NA 6.29 0.023 0.84 2.66 7.13 56.59 ▇▂▁▁▁▁▁▁
numeric 2005 17 247 264 NA NA NA NA 4.82 NA 6.41 0.021 0.86 2.77 7.03 58.92 ▇▂▁▁▁▁▁▁
numeric 2006 16 248 264 NA NA NA NA 4.9 NA 6.56 0.024 0.8 2.92 7.04 62.82 ▇▂▁▁▁▁▁▁
numeric 2007 15 249 264 NA NA NA NA 4.93 NA 6.39 0.024 0.89 2.89 6.93 53.19 ▇▂▁▁▁▁▁▁
numeric 2008 15 249 264 NA NA NA NA 4.94 NA 6.18 0.023 0.82 3.03 7.01 46.67 ▇▂▁▁▁▁▁▁
numeric 2009 15 249 264 NA NA NA NA 4.72 NA 5.86 0.022 0.83 2.95 6.32 43.51 ▇▃▁▁▁▁▁▁
numeric 2010 15 249 264 NA NA NA NA 4.84 NA 5.89 0.024 0.82 2.93 6.64 40.74 ▇▂▁▁▁▁▁▁
numeric 2011 15 249 264 NA NA NA NA 4.81 NA 5.82 0.027 0.84 2.93 6.72 41.21 ▇▃▁▁▁▁▁▁
numeric 2012 13 251 264 NA NA NA NA 4.95 NA 6.19 0.03 0.83 3.03 6.66 44.62 ▇▃▁▁▁▁▁▁
numeric 2013 13 251 264 NA NA NA NA 4.86 NA 5.86 0.03 0.85 3.01 6.71 37.78 ▇▃▁▁▁▁▁▁
numeric 2014 14 250 264 NA NA NA NA 4.87 NA 6.1 0.044 0.88 3.15 6.37 45.42 ▇▂▁▁▁▁▁▁

Is anything missing

library(mi)
df %>% as.data.frame() %>% missing_data.frame() %>% image()

Those aren’t Countries!

There are quite a few regional and economic groups included as countries. I decided to remove these and keep just the countries and the value for the world as a whole. The datatable made it easier to browse through the data and find what I wanted to remove.

# Lookup table mapping country codes to names
country_names <- df %>%
  pull(country_name) %>%
  set_names(df %>% pull(country_code)) %>%
  sort()

df %>%
  select(country_code, country_name) %>%
  unique() %>%
  datatable()
non_countries <- c(
  "ARB", "CEB", "CSS", "EAP", "EAR", "EAS", "ECA", "ECS", "EMU",
  "EUU", "FCS", "HIC", "HPC", "IBD", "IBT", "IDA", "IDB", "IDX", "INX",
  "LAC", "LCN", "LDC", "LIC", "LMC", "LMY", "LTE", "MEA", "MIC", "MNA",
  "NAC", "OED", "OSS", "PRE", "PSS", "PST", "SAS", "SSA", "SSF", "SST",
  "TEA", "TEC", "TLA", "TMN", "TSA", "TSS", "UMC"
)
# remove the unwated rows
df <- df %>% filter(!(country_code %in% non_countries))

Reshape the table

Here I create a long data frame with columns indicating the country, year, and carbon dioxide emissions

long_df <- df %>%
  select(-indicator_code, -indicator_name, -country_name) %>%
  gather(key = "year", value = "co2_per_cap", -country_code) %>%
  mutate(year = as.integer(year))

skim_to_wide(long_df)
type variable missing complete n min max empty n_unique mean sd p0 p25 p50 p75 p100 hist
character country_code 0 12862 12862 3 3 0 218 NA NA NA NA NA NA NA NA
integer year 0 12862 12862 NA NA NA NA 1989 17.03 1960 1974 1989 2004 2018 ▇▇▇▇▇▇▇▇
numeric co2_per_cap 2957 9905 12862 NA NA NA NA 4.4 7.43 -0.02 0.37 1.61 6.01 100.7 ▇▁▁▁▁▁▁▁

Clean up the missing data

I removed the years from 2015, which contained no data. I also created a separate data frame containg only the countries with complete records since 1960.

# 2015 and on don't have any data. rather than remove them by hand, This does so programatically
good_years <- long_df %>%
  group_by(year) %>%
  summarise(has_data = any(!is.na(co2_per_cap))) %>%
  filter(has_data) %>%
  pull(year)

long_df <- long_df %>% filter(year %in% good_years)

skim_to_wide(long_df)
type variable missing complete n min max empty n_unique mean sd p0 p25 p50 p75 p100 hist
character country_code 0 11990 11990 3 3 0 218 NA NA NA NA NA NA NA NA
integer year 0 11990 11990 NA NA NA NA 1987 15.88 1960 1973 1987 2001 2014 ▇▇▇▇▇▇▇▇
numeric co2_per_cap 2085 9905 11990 NA NA NA NA 4.4 7.43 -0.02 0.37 1.61 6.01 100.7 ▇▁▁▁▁▁▁▁
# create a dataframe with every available record
long_all <- long_df %>% filter(is.na(co2_per_cap))

# Choose only the countries that have complete records.
complete_countries <- long_df %>%
  group_by(country_code) %>%
  summarise(all_years = !any(is.na(co2_per_cap))) %>%
  filter(all_years) %>%
  pull(country_code)

long_df <- long_df %>%
  filter(country_code %in% complete_countries)

skim_to_wide(long_df)
type variable missing complete n min max empty n_unique mean sd p0 p25 p50 p75 p100 hist
character country_code 0 8250 8250 3 3 0 150 NA NA NA NA NA NA NA NA
integer year 0 8250 8250 NA NA NA NA 1987 15.88 1960 1973 1987 2001 2014 ▇▇▇▇▇▇▇▇
numeric co2_per_cap 0 8250 8250 NA NA NA NA 4.36 7.55 -0.02 0.35 1.58 5.93 100.7 ▇▁▁▁▁▁▁▁
long_df %>% datatable()

Time to take a look

Here I’ll take a look at the countries which have a complete record from 1960 to 2014 ### Everything

ggplot(data = long_df) +
  geom_line(aes(x = year, y = co2_per_cap, group = country_code))

This is pretty crowded and hard to see much on.

Countries with the Highest Emissions

These countries had the highest average over the course of the dataset

long_df %>%
  group_by(country_code) %>%
  summarise(co2_avg = mean(co2_per_cap)) %>%
  arrange(desc(co2_avg)) %>%
  head(10) %>%
  ggplot() +
  geom_col(aes(x = country_code, y = co2_avg)) +
  scale_x_discrete(name = "Country", labels = function(x) country_names[x])

A Closer Look at the Top Counries

long_df %>%
  group_by(country_code) %>%
  summarise(co2_avg = mean(co2_per_cap)) %>%
  arrange(desc(co2_avg)) %>%
  head(10) %>%
  pull(country_code) %>%
  lapply(
    function(x) ggplot(filter(long_df, country_code == x), aes(year, co2_per_cap)) +
        geom_line() +
        labs(title = country_names[x]) +
        coord_cartesian(ylim = c(0, 100))
  ) %>%
  walk(print)

There isn’t a consistent pattern amoung the countries with the highest per capita carbon dioxide emissions. Some are highly industrialized Western nations, which tend to have stable but high emissions. The others tend to be less stable over time, with spikes of extreme emission levels. Qatar is unique in that it’s baseline is consistently higher than anyone else’s and it has spikes to even more extreme levels

Main Visual

plot1 <- long_df %>%
  filter(
    country_code %in%
      c("USA", "CHN", "AUS", "CAN", "GBR", "IND", "JPN", "PAK", "VEN", "VNM")
  ) %>%
  mutate(Country = country_names[country_code]) %>%
  rename(`CO2 Emissions` = co2_per_cap, Year = year) %>%
  select(-country_code) %>%
  ggplot() +
  geom_line(aes(x = Year, y = `CO2 Emissions`, color = Country), size = 2) +
  scale_color_brewer(name = "Country", labels = function(x) country_names[x], palette = "Set3") +
  scale_y_continuous(name = "Tons of CO2 Per Capita") +
  scale_x_continuous(name = "Year", breaks = seq.int(from = 1960, to = 2015, by = 5)) +
  theme(
    plot.title = element_text(size = 24, vjust = .5, hjust = .5),
    axis.title = element_text(size = 16, hjust = .5, vjust = .5),
    panel.background = element_rect(fill = "white"),
    legend.title = element_text(face = "bold"),
    panel.grid.major.y = element_line(color = "grey75", size = .1, linetype = "solid")
  ) +
  ggtitle("Per Capita Carbon Dioxide Emissions")
ggplotly(plot1)

Essay

For this project I decided to work with the carbon dioxide emissions dataset from the World Bank. Global warming and it’s effects are going to become increasily influential and damaging over the course of my lifetime. Human activity and carbon dioxide released by the combustion of fossil fuels in particular have major impacts on the environment. This dataset tracks the amount of carbon dioxide emitted since 1960 in every country. Because of the breakup of the Soviet Union in the early 1990’s Several major countries including Germany and Russia were not in existence for the entirety of the dataset. I looked at counries with complete records since 1960, because I was interested in the early years of the data set and how various countries and the world have progressed over time.

This visualization shows the carbon dioxide emissions of some of the largest countries since 1960. It clearly shows the massive impact of the United States, even in relation to other similarly developed and industrialized nations. US Emissions have leveled off and slightly declined starting in the early 90’s. The U.S. still leads the way in terms of emissions, but other countries are quickly catching up. Some of these large countries have been able to reduce their emisions, but this is not the general trend. China has had massive growth in emissions as a result of its rapid industrialization and economic growth. Much of the world’s population lives in poorer, less developed nations that currently have little carbon output, but that will likely change as they industrialize.

Extra

It was hard to do very much with just the co2 emission numbers, so I wanted to around a little with the rest of the data from the World Develpment Index. This isn’t finished, but there were a few interesting things, so I left it in.

wb_df <- read_csv(here::here("data", "WDI", "WDIData.csv"))
names(wb_df) <- names(wb_df) %>% tolower() %>% str_replace(" ", "_")
wb_df %>% select(indicator_name, indicator_code) %>% unique() %>% datatable()
wb_names <- wb_df %>%
  select(country_code, country_name) %>%
  unique() %>%
  (function(x) x %>% pull(country_name) %>% set_names(x %>% pull(country_code))) %>%
  sort()
wb_df <- wb_df %>% filter(!(country_code %in% non_countries))
codes <- c(
  "AG.LND.TOTL.K2",
  "SP.URB.TOTL",
  "SP.RUR.TOTL",
  "AG.LND.AGRI.K2",
  "AG.LND.CROP.ZS",
  "SP.POP.TOTL",
  "EN.ATM.CO2E.KT",
  "NY.GDP.MKTP.KD",
  "AG.SRF.TOTL.K2",
  "ER.FSH.PROD.MT",
  "TX.VAL.MRCH.CD.WT",
  "TM.VAL.MRCH.CD.WT",
  "SP.DYN.CBRT.IN",
  "SP.DYN.CDRT.IN"
)

col_ids <- c(
  "total_land_area",
  "urban_population",
  "rural_population",
  "agriculture_land_area",
  "cropland_ratio",
  "population",
  "co2_emissions",
  "gdp",
  "surface_area",
  "fishery_production",
  "merchandise_exports",
  "merchandise_imports",
  "birth_rate",
  "death_rate"
) %>%
  set_names(codes)

world <- wb_df %>%
  filter(indicator_code %in% codes) %>%
  mutate(temp_col = col_ids[indicator_code]) %>%
  select(-indicator_code, -indicator_name, -x64, -`1960`) %>%
  gather(key = "year", value = "value", -country_code, -temp_col, -country_name) %>%
  mutate(year = as.integer(year)) %>%
  spread(temp_col, value) %>%
  mutate(
    cropland_area = total_land_area * cropland_ratio,
    births = floor(birth_rate * population / 1000),
    deaths = floor(death_rate * population / 1000)
  ) %>%
  select(-birth_rate, -death_rate, -cropland_ratio)


w_years <- world %>%
  group_by(year) %>%
  summarise(has_data = any(!is.na(co2_emissions))) %>%
  filter(has_data) %>%
  pull(year)

world <- world %>% filter(year %in% w_years)

ccc <- world %>%
  group_by(country_code) %>%
  summarise(all_years = !any(is.na(co2_emissions) | is.na(gdp) | is.na(agriculture_land_area))) %>%
  filter(all_years) %>%
  pull(country_code)

world <- world %>%
  filter(country_code %in% ccc) %>%
  mutate(
    cropland_area = if_else(is.na(cropland_area) & country_code == "HKG", 1000, cropland_area),
    merchandise_exports = if_else(is.na(merchandise_exports) & country_code == "BHS", lag(merchandise_exports), merchandise_exports),
    merchandise_imports = if_else(is.na(merchandise_imports) & country_code == "NPL", 75000000, merchandise_imports)
  )
world_total <- world %>% filter(country_code == "WLD")
model_data <- world %>% filter(country_code != "WLD")
model_data %>%
  select(-country_code, -country_name) %>%
  cor() %>%
  corrplot::corrplot(method = "number")

model_data %>%
  select(-country_code, -country_name) %>%
  GGally::ggpairs()

model_variables <- names(model_data)[
  !(names(model_data) %in% c("country_name", "country_code", "co2_emissions", "population"))
]
model_formula <- as.formula(
  paste0(
    "co2_emissions ~ ",
    paste(model_variables, collapse = "+")
  )
)
linear_model <- lm(formula = model_formula, data = model_data)
summary(linear_model)
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1543020   -23185    18769    54270  2132070 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            3.136e+06  4.235e+05   7.405 1.57e-13 ***
## year                  -1.591e+03  2.132e+02  -7.464 1.01e-13 ***
## agriculture_land_area  2.846e-01  1.145e-02  24.867  < 2e-16 ***
## fishery_production     3.731e-02  1.762e-03  21.176  < 2e-16 ***
## gdp                    2.623e-07  6.876e-09  38.150  < 2e-16 ***
## merchandise_exports    1.462e-07  1.051e-07   1.391    0.164    
## merchandise_imports   -4.789e-07  9.744e-08  -4.915 9.21e-07 ***
## rural_population       6.928e-04  1.339e-04   5.174 2.40e-07 ***
## surface_area           1.520e+00  5.916e-02  25.699  < 2e-16 ***
## total_land_area       -1.632e+00  6.412e-02 -25.454  < 2e-16 ***
## urban_population       6.289e-03  2.407e-04  26.129  < 2e-16 ***
## cropland_area         -3.750e-02  1.984e-03 -18.906  < 2e-16 ***
## births                -6.575e-02  5.293e-03 -12.421  < 2e-16 ***
## deaths                 2.197e-02  1.254e-02   1.751    0.080 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 197300 on 4252 degrees of freedom
## Multiple R-squared:  0.9265, Adjusted R-squared:  0.9263 
## F-statistic:  4122 on 13 and 4252 DF,  p-value: < 2.2e-16
broom::tidy(linear_model)
term estimate std.error statistic p.value
(Intercept) 3.136178e+06 4.235181e+05 7.405063 0.0000000
year -1.591368e+03 2.131922e+02 -7.464476 0.0000000
agriculture_land_area 2.846447e-01 1.144680e-02 24.866673 0.0000000
fishery_production 3.730960e-02 1.761900e-03 21.175523 0.0000000
gdp 3.000000e-07 0.000000e+00 38.149605 0.0000000
merchandise_exports 1.000000e-07 1.000000e-07 1.390780 0.1643649
merchandise_imports -5.000000e-07 1.000000e-07 -4.915044 0.0000009
rural_population 6.928000e-04 1.339000e-04 5.173862 0.0000002
surface_area 1.520484e+00 5.916480e-02 25.699124 0.0000000
total_land_area -1.632016e+00 6.411650e-02 -25.453909 0.0000000
urban_population 6.288900e-03 2.407000e-04 26.129286 0.0000000
cropland_area -3.750420e-02 1.983700e-03 -18.906058 0.0000000
births -6.574840e-02 5.293300e-03 -12.421116 0.0000000
deaths 2.197010e-02 1.254490e-02 1.751309 0.0799648
world_prediction <- world_total %>%
  select(-country_code, -country_name, -co2_emissions, -population) %>%
  predict(object = linear_model, newdata = .)
world_total %>%
  select(year, co2_emissions) %>%
  rename(actual = co2_emissions) %>%
  add_column(prediction = world_prediction) %>%
  gather(key = "type", value = "co2_tons", -year) %>%
  ggplot() +
  geom_point(aes(x = year, y = co2_tons, color = type))